{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Route Hugging Face Hub traffic through a mirror endpoint; set before\n",
    "# importing `datasets` so the mirror is picked up at import time.\n",
    "os.environ[\"HF_ENDPOINT\"] = \"https://hf-mirror.com\"\n",
    "\n",
    "from datasets import load_dataset\n",
    "\n",
    "# RefCOCOg referring-expression dataset; cached locally so re-running\n",
    "# the notebook does not re-download it.\n",
    "ds = load_dataset(\"Kangheng/refcocog\", cache_dir=\"./cache\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    test: Dataset({\n",
       "        features: ['question_id', 'question', 'image', 'bbox', 'image_size'],\n",
       "        num_rows: 9602\n",
       "    })\n",
       "    val: Dataset({\n",
       "        features: ['question_id', 'question', 'image', 'bbox', 'image_size'],\n",
       "        num_rows: 4896\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question_id': 8,\n",
       " 'question': 'the man in yellow coat',\n",
       " 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x376>,\n",
       " 'bbox': '[374, 65, 510, 266]',\n",
       " 'image_size': [640, 376]}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds['test'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing items: 100%|██████████| 14498/14498 [05:19<00:00, 45.44it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data has been saved to data_test.jsonl and data_val.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import json\n",
    "from PIL import Image\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Export every example from the 'test' and 'val' splits:\n",
    "#   * the PIL image is written to dataset/<split>-<question_id>.jpeg\n",
    "#   * a chat-style [user, assistant] message pair referencing the saved\n",
    "#     image is appended to data_<split>.jsonl\n",
    "\n",
    "# Directory for the exported JPEGs — created once up front instead of\n",
    "# once per item inside the loop (the call is loop-invariant).\n",
    "save_dir = 'dataset'\n",
    "os.makedirs(save_dir, exist_ok=True)\n",
    "\n",
    "# Total example count drives the shared progress bar.\n",
    "total_items = sum(len(ds[data_key]) for data_key in ['test', 'val'])\n",
    "\n",
    "with tqdm(total=total_items, desc=\"Processing items\") as pbar:\n",
    "    # Process each split into its own JSON Lines file.\n",
    "    for data_key in ['test', 'val']:\n",
    "        output_file = f'data_{data_key}.jsonl'\n",
    "\n",
    "        with open(output_file, 'w', encoding='utf-8') as f:\n",
    "            for item in ds[data_key]:\n",
    "                # File name carries the split prefix, e.g. 'test-8.jpeg'.\n",
    "                image_name = f'{data_key}-{item[\"question_id\"]}.jpeg'\n",
    "                save_path = os.path.join(save_dir, image_name)\n",
    "\n",
    "                # Re-encode as JPEG so every saved image has one format.\n",
    "                item['image'].save(save_path, format='JPEG')\n",
    "\n",
    "                # Chat-format record: user turn (image + instruction),\n",
    "                # assistant turn (special grounding tokens around the bbox).\n",
    "                new_data_format = [\n",
    "                    {\n",
    "                        \"role\": \"user\",\n",
    "                        \"content\": [\n",
    "                            {\n",
    "                                \"type\": \"image\",\n",
    "                                \"image\": f'dataset/{image_name}',\n",
    "                            },\n",
    "                            {\n",
    "                                \"type\": \"text\",\n",
    "                                \"text\": f'Please provide the bounding box for the following description: {item[\"question\"]}',\n",
    "                            },\n",
    "                        ],\n",
    "                    },\n",
    "                    {\n",
    "                        \"role\": \"assistant\",\n",
    "                        \"content\": f'<|object_ref_start|>{item[\"question\"]}<|object_ref_end|> is located at <|box_start|>{item[\"bbox\"]}<|box_end|>'\n",
    "                    }\n",
    "                ]\n",
    "\n",
    "                # One JSON array per line (JSON Lines format).\n",
    "                f.write(json.dumps(new_data_format, ensure_ascii=False) + '\\n')\n",
    "\n",
    "                pbar.update(1)\n",
    "\n",
    "print(\"Data has been saved to data_test.jsonl and data_val.jsonl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _read_jsonl(path):\n",
    "    \"\"\"Read a JSON Lines file into a list of parsed records.\"\"\"\n",
    "    with open(path, 'r', encoding='utf-8') as fh:\n",
    "        return [json.loads(raw) for raw in fh]\n",
    "\n",
    "loaded_data_test = _read_jsonl('data_test.jsonl')\n",
    "loaded_data_val = _read_jsonl('data_val.jsonl')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'role': 'user',\n",
       "  'content': [{'type': 'image', 'image': 'dataset/test-8.jpeg'},\n",
       "   {'type': 'text',\n",
       "    'text': 'Please provide the bounding box for the following description: the man in yellow coat'}]},\n",
       " {'role': 'assistant',\n",
       "  'content': '<|object_ref_start|>the man in yellow coat<|object_ref_end|> is located at <|box_start|>[374, 65, 510, 266]<|box_end|>'}]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loaded_data_test[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(list,\n",
       " [{'role': 'user',\n",
       "   'content': [{'type': 'image', 'image': 'dataset/test-8.jpeg'},\n",
       "    {'type': 'text',\n",
       "     'text': 'Please provide the bounding box for the following description: the man in yellow coat'}]},\n",
       "  {'role': 'assistant',\n",
       "   'content': '<|object_ref_start|>the man in yellow coat<|object_ref_end|> is located at <|box_start|>[374, 65, 510, 266]<|box_end|>'}])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# `loaded_data` was never defined (leftover from a deleted cell and a\n",
    "# NameError on a fresh kernel); the stale output shows it held the test\n",
    "# split, so inspect `loaded_data_test` explicitly.\n",
    "type(loaded_data_test), loaded_data_test[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test and Val Datasets have been created.\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from datasets import Dataset\n",
    "\n",
    "def load_and_convert_data(file_path):\n",
    "    \"\"\"Load a chat-format JSONL file and flatten it into per-example dicts.\n",
    "\n",
    "    Each input line is a [user, assistant] message pair as written by the\n",
    "    export cell above. Returns a list of flat dicts with 'role',\n",
    "    'image_path', 'question' and 'assistant_answer' keys, suitable for\n",
    "    Dataset.from_list.\n",
    "    \"\"\"\n",
    "    loaded_data = []\n",
    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
    "        for line in file:\n",
    "            loaded_data.append(json.loads(line))\n",
    "\n",
    "    dataset_dicts = []\n",
    "    for item in loaded_data:\n",
    "        user_content = item[0]['content']\n",
    "        assistant_content = item[1]['content']\n",
    "\n",
    "        # Pull the image entry and the text entry out of the user turn;\n",
    "        # either may be missing, hence the None defaults.\n",
    "        image_info = next((x for x in user_content if x['type'] == 'image'), None)\n",
    "        text_info = next((x for x in user_content if x['type'] == 'text'), None)\n",
    "\n",
    "        # NOTE(review): 'role' is always 'user' even though the row also\n",
    "        # carries the assistant answer — kept as-is for compatibility with\n",
    "        # downstream cells that read item['role'].\n",
    "        dataset_entry = {\n",
    "            'role': 'user',\n",
    "            'image_path': image_info['image'] if image_info else None,\n",
    "            'question': text_info['text'] if text_info else None,\n",
    "            'assistant_answer': assistant_content\n",
    "        }\n",
    "\n",
    "        dataset_dicts.append(dataset_entry)\n",
    "\n",
    "    return dataset_dicts\n",
    "\n",
    "# Files produced by the export cell above.\n",
    "test_data_path = 'data_test.jsonl'\n",
    "val_data_path = 'data_val.jsonl'\n",
    "\n",
    "test_dataset_dicts = load_and_convert_data(test_data_path)\n",
    "val_dataset_dicts = load_and_convert_data(val_data_path)\n",
    "\n",
    "# Wrap as Hugging Face Dataset objects for downstream use.\n",
    "test_dataset = Dataset.from_list(test_dataset_dicts)\n",
    "val_dataset = Dataset.from_list(val_dataset_dicts)\n",
    "\n",
    "print(\"Test and Val Datasets have been created.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "({'role': 'user',\n",
       "  'image_path': 'dataset/test-8.jpeg',\n",
       "  'question': 'Please provide the bounding box for the following description: the man in yellow coat',\n",
       "  'assistant_answer': '<|object_ref_start|>the man in yellow coat<|object_ref_end|> is located at <|box_start|>[374, 65, 510, 266]<|box_end|>'},\n",
       " {'role': 'user',\n",
       "  'image_path': 'dataset/val-61.jpeg',\n",
       "  'question': 'Please provide the bounding box for the following description: a bush of plant behind middle woman',\n",
       "  'assistant_answer': '<|object_ref_start|>a bush of plant behind middle woman<|object_ref_end|> is located at <|box_start|>[285, 23, 424, 146]<|box_end|>'})"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val_dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'role': 'user', 'image_path': 'dataset/val-61.jpeg', 'question': 'Please provide the bounding box for the following description: a bush of plant behind middle woman', 'assistant_answer': '<|object_ref_start|>a bush of plant behind middle woman<|object_ref_end|> is located at <|box_start|>[285, 23, 424, 146]<|box_end|>'}\n",
      "dict_keys(['role', 'image_path', 'question', 'assistant_answer'])\n",
      "user\n",
      "\n",
      "{'role': 'user', 'image_path': 'dataset/val-62.jpeg', 'question': \"Please provide the bounding box for the following description: green plant behind a table visible behind a lady ' s head\", 'assistant_answer': \"<|object_ref_start|>green plant behind a table visible behind a lady ' s head<|object_ref_end|> is located at <|box_start|>[285, 23, 424, 146]<|box_end|>\"}\n",
      "dict_keys(['role', 'image_path', 'question', 'assistant_answer'])\n",
      "user\n",
      "\n",
      "{'role': 'user', 'image_path': 'dataset/val-127.jpeg', 'question': 'Please provide the bounding box for the following description: black and white dog with pointy ears', 'assistant_answer': '<|object_ref_start|>black and white dog with pointy ears<|object_ref_end|> is located at <|box_start|>[183, 68, 309, 166]<|box_end|>'}\n",
      "dict_keys(['role', 'image_path', 'question', 'assistant_answer'])\n",
      "user\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Peek at the first three validation examples.\n",
    "for idx, item in enumerate(val_dataset):\n",
    "    if idx == 3:\n",
    "        break\n",
    "    print(item)\n",
    "    print(item.keys())\n",
    "    print(item['image_path'])\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
